Assessment of data quality and completeness in preparation for analysis
Three datasets were provided by Sprocket Central Pty Ltd which are: Customer Demographic, Customer Addresses, Transactions data in the past 3 months
#' Read file
transaction <- readxl::read_excel(path = file.path("temp/KPMG/KPMG_VI_New_raw_data_update_final.xlsx"),
sheet = 2,
skip =1)
customer_demographic <- readxl::read_excel(path = file.path("temp/KPMG/KPMG_VI_New_raw_data_update_final.xlsx"),
sheet = 4,
skip =1)
customer_address <- readxl::read_excel(path = file.path("temp/KPMG/KPMG_VI_New_raw_data_update_final.xlsx"),
sheet = 5,
skip =1)
Data quality meets seven dimensions: accuracy, completeness, consistency, currentcy, relevancy, validity, and uniqueness.
#' Transactions
summary(transaction)
## transaction_id product_id customer_id
## Min. : 1 Min. : 0.00 Min. : 1.0
## 1st Qu.: 5001 1st Qu.: 18.00 1st Qu.: 857.8
## Median :10000 Median : 44.00 Median :1736.0
## Mean :10000 Mean : 45.36 Mean :1738.2
## 3rd Qu.:15000 3rd Qu.: 72.00 3rd Qu.:2613.0
## Max. :20000 Max. :100.00 Max. :5034.0
##
## transaction_date online_order order_status
## Min. :2017-01-01 00:00:00 Mode :logical Length:20000
## 1st Qu.:2017-04-01 00:00:00 FALSE:9811 Class :character
## Median :2017-07-03 00:00:00 TRUE :9829 Mode :character
## Mean :2017-07-01 14:08:05 NA's :360
## 3rd Qu.:2017-10-02 00:00:00
## Max. :2017-12-30 00:00:00
##
## brand product_line product_class product_size
## Length:20000 Length:20000 Length:20000 Length:20000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## list_price standard_cost product_first_sold_date
## Min. : 12.01 Min. : 7.21 Min. :33259
## 1st Qu.: 575.27 1st Qu.: 215.14 1st Qu.:35667
## Median :1163.89 Median : 507.58 Median :38216
## Mean :1107.83 Mean : 556.05 Mean :38200
## 3rd Qu.:1635.30 3rd Qu.: 795.10 3rd Qu.:40672
## Max. :2091.47 Max. :1759.85 Max. :42710
## NA's :197 NA's :197
#' Customer Demographics
summary(customer_demographic)
## customer_id first_name last_name gender
## Min. : 1 Length:4000 Length:4000 Length:4000
## 1st Qu.:1001 Class :character Class :character Class :character
## Median :2000 Mode :character Mode :character Mode :character
## Mean :2000
## 3rd Qu.:3000
## Max. :4000
##
## past_3_years_bike_related_purchases DOB job_title
## Min. : 0.00 Length:4000 Length:4000
## 1st Qu.:24.00 Class :character Class :character
## Median :48.00 Mode :character Mode :character
## Mean :48.89
## 3rd Qu.:73.00
## Max. :99.00
##
## job_industry_category wealth_segment deceased_indicator default
## Length:4000 Length:4000 Length:4000 Length:4000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## owns_car tenure
## Length:4000 Min. : 1.00
## Class :character 1st Qu.: 6.00
## Mode :character Median :11.00
## Mean :10.66
## 3rd Qu.:15.00
## Max. :22.00
## NA's :87
#' Customer Address
summary(customer_address)
## customer_id address postcode state
## Min. : 1 Length:3999 Min. :2000 Length:3999
## 1st Qu.:1004 Class :character 1st Qu.:2200 Class :character
## Median :2004 Mode :character Median :2768 Mode :character
## Mean :2004 Mean :2986
## 3rd Qu.:3004 3rd Qu.:3750
## Max. :4003 Max. :4883
## country property_valuation
## Length:3999 Min. : 1.000
## Class :character 1st Qu.: 6.000
## Mode :character Median : 8.000
## Mean : 7.514
## 3rd Qu.:10.000
## Max. :12.000
#' Transactions
transaction %>% dlookr::diagnose() %>%
tidytable::select.(-unique_count, -unique_rate) %>%
tidytable::filter.(missing_count > 0) %>%
tidytable::arrange.(desc(missing_count)) %>%
reactable::reactable()
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
#' Transactions
customer_demographic %>% dlookr::diagnose() %>%
tidytable::select.(-unique_count, -unique_rate) %>%
tidytable::filter.(missing_count > 0) %>%
tidytable::arrange.(desc(missing_count)) %>%
reactable::reactable()
#' Transactions
customer_address %>% dlookr::diagnose() %>%
tidytable::select.(-unique_count, -unique_rate) %>%
tidytable::filter.(missing_count > 0) %>%
tidytable::arrange.(desc(missing_count)) %>%
reactable::reactable()
#' Table col
list(transaction, customer_address, customer_demographic) %>% purrr::map(~ (names(.))) %>%
set_names(nm = c("transaction", "customer_address", "customer_demographic"))
## $transaction
## [1] "transaction_id" "product_id"
## [3] "customer_id" "transaction_date"
## [5] "online_order" "order_status"
## [7] "brand" "product_line"
## [9] "product_class" "product_size"
## [11] "list_price" "standard_cost"
## [13] "product_first_sold_date"
##
## $customer_address
## [1] "customer_id" "address" "postcode"
## [4] "state" "country" "property_valuation"
##
## $customer_demographic
## [1] "customer_id" "first_name"
## [3] "last_name" "gender"
## [5] "past_3_years_bike_related_purchases" "DOB"
## [7] "job_title" "job_industry_category"
## [9] "wealth_segment" "deceased_indicator"
## [11] "default" "owns_car"
## [13] "tenure"
#' Plot
table_dm <- dm::dm(transaction, customer_demographic, customer_address)
table_dm %>%
dm::dm_add_pk(transaction, customer_id) %>%
dm::dm_add_pk(customer_address, customer_id) %>%
dm::dm_add_pk(customer_demographic, customer_id) %>%
dm::dm_add_fk(transaction, customer_id, customer_address) %>%
dm::dm_add_fk(transaction, customer_id, customer_demographic) %>% dm::dm_draw(view_type = "all")
#' Transactions
transaction_validate <- transaction %>% tidytable::as_tidytable() %>%
tidytable::mutate.(
transaction_date = as.Date(transaction_date),
product_first_sold_date = as.Date(product_first_sold_date, origin = "1899-12-30")) %>%
tidytable::arrange.(-product_first_sold_date)
transaction_validate %>% reactable::reactable()
#### 2.4.2. Customer Demographics
## Warning in as.Date(as.numeric(DOB), origin = "1899-12-30"): NAs introduced by
## coercion
#' Transactions
transaction %>% dlookr::diagnose() %>%
tidytable::select.(-missing_count,-missing_percent) %>%
tidytable::arrange.(-unique_count) %>% reactable::reactable()
#' Customer Demographics
customer_demographic %>% dlookr::diagnose() %>%
tidytable::select.(-missing_count,-missing_percent) %>%
tidytable::arrange.(-unique_count) %>% reactable::reactable()
#' Customer Address
customer_address %>% dlookr::diagnose() %>%
tidytable::select.(-missing_count,-missing_percent) %>%
tidytable::arrange.(-unique_count) %>% reactable::reactable()